From 9aed0e5eb66c57e4352a5bc293d157eadba77abd Mon Sep 17 00:00:00 2001 From: Ell Date: Tue, 7 Nov 2017 15:44:39 -0500 Subject: [PATCH] babl, sse2-float: fall back to slow, accurate path for large pow-2.4 inputs The approximations we use for pow_24() and pow_1_24() diverge from the actual function for large-enough input values. This can lead not just to inaccurate results, but also to infinities and NaNs, especially when multiple conversions are strung in a row. When the input value is large enough to produce notable divergence (the difference between the approximate and actual values is ~1% at the chosen limits,) fall back to a slower, but more accurate version. For the SSE2 float conversions, this results in an increase of ~5% in conversion time, when all values are below the limit. When most values are above the limit, performance can be 10x slower or worse. --- babl/base/pow-24.c | 28 ++++++++++++++++++++++++---- babl/base/pow-24.h | 28 ++++++++++++++++++++++++---- extensions/sse2-float.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 8 deletions(-) diff --git a/babl/base/pow-24.c b/babl/base/pow-24.c index dbc4071..5a4f0ad 100644 --- a/babl/base/pow-24.c +++ b/babl/base/pow-24.c @@ -48,8 +48,13 @@ init_newton (double x, double exponent, double c0, double c1, double c2) double babl_pow_24 (double x) { - double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332); + double y; int i; + if (x > 16.0) { + /* for large values, fall back to a slower but more accurate version */ + return exp (log (x) * 2.4); + } + y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332); for (i = 0; i < 3; i++) y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y)); x *= y; @@ -61,9 +66,14 @@ babl_pow_24 (double x) double babl_pow_1_24 (double x) { - double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383); + double y; int i; double z; + if (x > 1024.0) { + /* for large values, fall back to a slower but more accurate version */ + return exp (log (x) * (1.0 / 2.4)); + } + y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383); x = sqrt (x); /* newton's method for x^(-1/6) */ z = (1./6.) * x; @@ -102,8 +112,13 @@ init_newtonf (float x, float exponent, float c0, float c1, float c2) float babl_pow_24f (float x) { - float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f); + float y; int i; + if (x > 16.0f) { + /* for large values, fall back to a slower but more accurate version */ + return expf (logf (x) * 2.4f); + } + y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f); for (i = 0; i < 3; i++) y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y)); x *= y; @@ -115,9 +130,14 @@ babl_pow_24f (float x) float babl_pow_1_24f (float x) { - float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f); + float y; int i; float z; + if (x > 1024.0f) { + /* for large values, fall back to a slower but more accurate version */ + return expf (logf (x) * (1.0f / 2.4f)); + } + y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f); x = sqrtf (x); /* newton's method for x^(-1/6) */ z = (1.f/6.f) * x; diff --git a/babl/base/pow-24.h b/babl/base/pow-24.h index a55c029..ed42cfe 100644 --- a/babl/base/pow-24.h +++ b/babl/base/pow-24.h @@ -54,8 +54,13 @@ init_newton (double x, double exponent, double c0, double c1, double c2) static inline double babl_pow_24 (double x) { - double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332); + double y; int i; + if (x > 16.0) { + /* for large values, fall back to a slower but more accurate version */ + return exp (log (x) * 2.4); + } + y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332); for (i = 0; i < 3; i++) y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y)); x *= y; @@ -67,9 +72,14 @@ babl_pow_24 (double x) static inline double babl_pow_1_24 (double x) { - double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383); + double y; int i; double z; + if (x > 1024.0) { + /* for large values, fall back to a slower but more accurate version */ + return exp (log (x) * (1.0 / 2.4)); + } + y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383); x = sqrt (x); /* newton's method for x^(-1/6) */ z = (1./6.) * x; @@ -133,8 +143,13 @@ init_newtonf (float x, float exponent, float c0, float c1, float c2) static inline float babl_pow_24f (float x) { - float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f); + float y; int i; + if (x > 16.0f) { + /* for large values, fall back to a slower but more accurate version */ + return expf (logf (x) * 2.4f); + } + y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f); for (i = 0; i < 3; i++) y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y)); x *= y; @@ -146,9 +161,14 @@ babl_pow_24f (float x) static inline float babl_pow_1_24f (float x) { - float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f); + float y; int i; float z; + if (x > 1024.0f) { + /* for large values, fall back to a slower but more accurate version */ + return expf (logf (x) * (1.0f / 2.4f)); + } + y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f); x = sqrtf (x); /* newton's method for x^(-1/6) */ z = (1.f/6.f) * x; diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c index d26073c..223f85c 100644 --- a/extensions/sse2-float.c +++ b/extensions/sse2-float.c @@ -237,6 +237,22 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl *conversion,const float *src, fl #define FLT_ONE 0x3f800000 // ((union {float f; int i;}){1.0f}).i #define FLT_MANTISSA (1<<23) +static inline float +sse_max_component (__v4sf x) { + __v4sf s; + __v4sf m; + + /* m = [max (x[3], x[1]), max (x[2], x[0])] */ + s = (__v4sf) _mm_shuffle_epi32 ((__m128i) x, _MM_SHUFFLE(0, 0, 3, 2)); + m = _mm_max_ps (x, s); + + /* m = [max (m[1], m[0])] = [max (max (x[3], x[1]), max (x[2], x[0]))] */ + s = (__v4sf) _mm_shuffle_epi32 ((__m128i) m, _MM_SHUFFLE(0, 0, 0, 1)); + m = _mm_max_ps (m, s); + + return m[0]; +} + static inline __v4sf sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2) { @@ -249,6 +265,13 @@ static inline __v4sf sse_pow_1_24 (__v4sf x) { __v4sf y, z; + if (sse_max_component (x) > 1024.0f) { + /* for large values, fall back to a slower but more accurate version */ + return _mm_set_ps (expf (logf (x[3]) * (1.0f / 2.4f)), + expf (logf (x[2]) * (1.0f / 2.4f)), + expf (logf (x[1]) * (1.0f / 2.4f)), + expf (logf (x[0]) * (1.0f / 2.4f))); + } y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383); x = _mm_sqrt_ps (x); /* newton's method for x^(-1/6) */ @@ -262,6 +285,13 @@ static inline __v4sf sse_pow_24 (__v4sf x) { __v4sf y, z; + if (sse_max_component (x) > 16.0f) { + /* for large values, fall back to a slower but more accurate version */ + return _mm_set_ps (expf (logf (x[3]) * 2.4f), + expf (logf (x[2]) * 2.4f), + expf (logf (x[1]) * 2.4f), + expf (logf (x[0]) * 2.4f)); + } y = sse_init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332); /* newton's method for x^(-1/5) */ z = splat4f (1.f/5.f) * x; -- 2.30.2